One month of data has been gathered about car ads. eBay Classifieds is all about brining buyers and sellers together to make a great deal. Sellers place ads, and buyers look for these ads by browsing or searching on the site. When a buyer finds an ad they find interesting in the result list and clicks it, we call this a ‘View Item page’- a VIP view. When a buyer proceeds to contact a seller to get more info or strike a deal, we call this a lead. They can do so by calling (Phone click), asking a question over email (ASQ: ask seller Question), clicking out to a seller’s website (URL_CLICK) or placing a bid.

Lets check what we can infer from this data and what type of predictive model we will be able to build.

Metadata:

Lets begin by reading in and examining our data

# Importing the Data
## Setup how the classes will be read in

class <- c( "numeric",  "numeric", "numeric",   "character",    "character",    "numeric",  "numeric",
           "numeric", "numeric",    "numeric",  "numeric",  "numeric",  "factor",   "character",    "numeric",
           "date",  "numeric", "numeric",   "character",    "numeric",  "numeric",  "character")


path <- c("/Users/mustafaozturk/Desktop/eBay Case/DataSet/cars_dataset_may2019.csv")
## Read in and examine the data
cars <- data.table::fread(path, colClasses = class)

initial findings:

#In some cases like below telclicks, bids and webclicks contains NA. It can be converted to 0
cars[src_ad_id==1063865800,]
##     src_ad_id telclicks bids kleur carrosserie kmstand days_live photo_cnt
## 1: 1063865800        NA   NA Zwart         MPV  130773         0         7
##    aantaldeuren n_asq bouwjaar emissie energielabel brand   l2 ad_start_dt
## 1:            5     0     2005     190            D MAZDA None  11/15/2016
##    vermogen webclicks model aantalstoelen price group
## 1:       85        NA     5             7 59500     B
#Converting NA to 0
cars$telclicks <- ifelse(is.na(cars$telclicks), "0", cars$telclicks)
cars$bids <- ifelse(is.na(cars$bids), "0", cars$bids)
cars$webclicks <- ifelse(is.na(cars$webclicks), "0", cars$webclicks)
#In some rows of ad_starts contains "." it should change to "/" when I checked the excel file. 
#But it has been taken care by R

table(cars$ad_start_dt)
## 
##  11/1/2016 11/10/2016 11/11/2016 11/12/2016 11/13/2016 11/14/2016 
##       7138       7054       7685       5809       3298       6302 
## 11/15/2016 11/16/2016 11/17/2016 11/18/2016 11/19/2016  11/2/2016 
##       6319       6108       7365       7092       5625       5670 
## 11/20/2016 11/21/2016 11/22/2016 11/23/2016 11/24/2016 11/25/2016 
##       3112       6695       6786       5464       6532       6988 
## 11/26/2016 11/27/2016 11/28/2016 11/29/2016  11/3/2016 11/30/2016 
##       5785       3205       8460       6111       7103       5540 
##  11/4/2016  11/5/2016  11/6/2016  11/7/2016  11/8/2016  11/9/2016 
##       7987       5710       2986       6309       6930       5894
#Changing NA, None and ? to .
cars[cars == "None"] <- NA
cars[cars == "?"] <- NA
# Summary Statistics
str(cars)
## Classes 'data.table' and 'data.frame':   183062 obs. of  22 variables:
##  $ src_ad_id    : num  1.12e+09 1.02e+09 1.09e+09 1.15e+09 1.06e+09 ...
##  $ telclicks    : chr  "1" "0" "0" "3" ...
##  $ bids         : chr  "0" "0" "0" "0" ...
##  $ kleur        : chr  "Zilver of Grijs" "Zilver of Grijs" "Zilver of Grijs" "Zwart" ...
##  $ carrosserie  : chr  "Hatchback (3/5-deurs)" "Hatchback (3/5-deurs)" "Stationwagon" "Sedan (2/4-deurs)" ...
##  $ kmstand      : num  44958 25072 301409 194481 238101 ...
##  $ days_live    : num  31 31 31 31 9 8 2 31 31 100 ...
##  $ photo_cnt    : num  16 22 12 21 8 13 24 14 18 11 ...
##  $ aantaldeuren : chr  "3" "5" "5" "4" ...
##  $ n_asq        : num  0 1 0 1 0 9 0 0 0 10 ...
##  $ bouwjaar     : num  2013 2015 2004 2007 2002 ...
##  $ emissie      : chr  "105" "122" "132" "314" ...
##  $ energielabel : Factor w/ 8 levels "?","A","B","C",..: 2 4 3 8 5 4 4 4 4 3 ...
##  $ brand        : chr  "VOLKSWAGEN" "VOLKSWAGEN" "OPEL" "AUDI" ...
##  $ l2           : chr  NA "157" NA "93" ...
##  $ ad_start_dt  : chr  "11/26/2016" "11/11/2016" "11/2/2016" "11/11/2016" ...
##  $ vermogen     : num  44 92 59 331 79 74 97 81 80 103 ...
##  $ webclicks    : chr  "2" "4" "3" "13" ...
##  $ model        : chr  "UP" "Golf" "Astra" "S8" ...
##  $ aantalstoelen: chr  "4" "5" "5" "5" ...
##  $ price        : num  79500 189500 27500 199000 12500 ...
##  $ group        : chr  "A" "A" "B" "A" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "index")= int 
##   ..- attr(*, "__src_ad_id")= int  97692 149657 33271 112977 94027 107736 57116 55810 109457 3443 ...
# Dimensions
dim(cars)
## [1] 183062     22
# Control the test group and check data

table(cars$group)
## 
##     A     B  NULL 
## 94281 80168  8613
# 8.613 NULL for Group
cars[cars == "NULL"] <- NA
# Data Exploration
## Missing data
MissingValues <- cars %>% summarise_all(funs(sum(is.na(.))/n()))
MissingValues <- gather(MissingValues, key = "feature", value = "MissingPct")
MissingValues %>%
  ggplot(aes(x = reorder(feature, - MissingPct), y = MissingPct)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() + theme_bw()

# A very small percentage of data is NA except l2, energielabel and carrosserie
# May need to do any KNN/RF imputations for l2, energielabel and carrosserie
# Creating new variables
## Create an age variable from date information
## ad start date + days live could be used instead of system time but lets the date as of the date of analysis
## transform some features by year using the new age variable in order to boost our predictive model power

cars <- cars %>% mutate(age = as.numeric(format(Sys.Date(), "%Y")) -
                          as.integer(cars$bouwjaar),
                        annual_emissions = as.numeric(emissie)/age,
                        annual_kms = kmstand / age)
# create an age grouping
cars <- cars %>% mutate(ageGroup = ifelse(age<= 3, "(<=3)", 
                                          ifelse(3 < age & age <= 6, "(4-6)",
                                                 ifelse(5 < age & age <= 10, "(7-10)",
                                                        ifelse(10 < age & age <= 15, "(11-15)",
                                                               ifelse(15 < age & age <= 20, "(16-20)",      
                                                                      "(20+)"))))))
cars$ageGroup <- as.factor(cars$ageGroup)
# Visual Exploration
## Now lets do a visual exploration of our new feature
## View distribution of variable
## Most packed around the 5 Year mark
cars %>% 
  ggplot(aes(x=age))+geom_line(stat="density", color="red", size=1.2)+theme_bw()

# Histogram for a view from another angle by year
ggplot(aes(cars$age), data=cars) +
  geom_histogram(color='white', fill='lightblue') +
  scale_x_continuous(limit=c(0, 35), breaks=seq(0, 35, 2)) +
  labs(x= 'Car Age', y= 'Number of Cars', title= 'Car Age Histogram')

# See if we can unconver anything by segregating by car type
# Vehicle type have a broad spectrum of ages
ggplot(aes(x= carrosserie, y= age), data=cars) +
  geom_boxplot(fill="lightblue", color='red') +
  geom_boxplot(aes(fill = carrosserie)) +
  stat_summary(fun.y = mean, geom="point", size=2) +
  labs(x= 'Vehicle Type', y= 'Age') +
  ggtitle('Age vs. Vehicle Type')

# Examine Car Types
## We have a very high amount of "Hatchbacks" in our dataset
ggplot(cars, aes(x=carrosserie, fill = carrosserie)) + 
  geom_bar() +
  labs(x= 'Vehicle Type', y= 'Number of Cars') +
  ggtitle('Vehicle Type Frequency Diagram')  

# How long before a car is sold?
## Most cars carry on for the 30+ days
ggplot(data=cars, aes(cars$days_live)) + 
  geom_histogram(breaks=seq(0, 35, by = 5), 
                 col="red", 
                 fill="green", 
                 alpha = .2) + 
  labs(title="Histogram for Days Live") +
  labs(x="Days Live", y="Count")

cars$telclicks <- as.numeric(cars$telclicks)
cars$bids <- as.numeric(cars$bids)
cars$webclicks <- as.numeric(cars$webclicks)
# create total clicks variable
cars <- cars %>% mutate(TotalClicks = (telclicks + webclicks + n_asq +bids))
# create the response variable as label
cars$clicked       <- ifelse(cars$TotalClicks > 0, 1, 0)
cars$clicked       <- as.factor(cars$clicked)
# create the response variable 10 days totalclicks
##In order to do that, I will exclude day_lives < 3days.
##I will extrapolated between 3-10 days to 10 days
##I will also find the ratio the 10 days 
cars$TenDaysClick       <- ifelse(cars$days_live < 3, NA, 
                                  ((10*cars$total_clicks)/cars$days_live))
# examine response variable
## as expected, most clicks fall into 0 or 1
ggplot(data=cars, aes(cars$TotalClicks)) + 
  geom_histogram(breaks=seq(0, 35, by = 5), 
                 col="red", 
                 fill="green", 
                 alpha = .2) + 
  labs(title="Histogram for Total Clicks") +
  labs(x="Total Clicks", y="Count")

# now that we have our label, lets examine the A/B test results
## create new table with only A/B test results for later analysis
carsAB <- cars %>% filter(group == "A" | group == "B")
carsA <- cars %>% filter(group == "A")
carsB <- cars %>% filter(group == "B")
### summary(carsA)
### summary(carsB)
## Examining the Data, the only difference between the groups we found was that
## Group A has Higher Mean Price than Group B
t.test(price ~ group, data = carsAB, alternative = "less")
## 
##  Welch Two Sample t-test
## 
## data:  price by group
## t = 2.4035, df = 169663, p-value = 0.9919
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 18349.16
## sample estimates:
## mean in group A mean in group B 
##       106457.10        95563.26
# hypothesis seems to be that price will affect our click rate on ads
# lets test this out, group A has significantly less clicks than group B
t.test(TotalClicks ~ group, data = carsAB, alternative= "greater")
## 
##  Welch Two Sample t-test
## 
## data:  TotalClicks by group
## t = -8.079, df = 167608, p-value = 1
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -0.432395       Inf
## sample estimates:
## mean in group A mean in group B 
##        5.150539        5.509792
# Looks like the groups may be split up to see impact of clicks by price
## lets visualize what that looks like
ggplotscaled<-ggplot(na.omit(carsAB), aes(x = scale(TotalClicks), y = scale(price), color = group)) +
  geom_point() +
  labs(title = "Clicks by Price in A/B Test (Scaled)") +
  labs(color = "Test Groups") +
  labs(x = "Total Clicks", y = "Price")
Clicks by Price

Clicks by Price

# Calculating the confidence intervals for each group by total clicks now
## Group A
errorTCA <- qt(0.90, df=length(carsA$TotalClicks) - 1) * sd(carsA$TotalClicks, na.rm = T) / sqrt(length(carsA$TotalClicks))
leftTCA <- mean(carsA$TotalClicks, na.rm = T) - errorTCA
rightTCA<- mean(carsA$TotalClicks, na.rm = T) + errorTCA
leftTCA; rightTCA
## [1] 5.112747
## [1] 5.188331
# Group B
errorTCB <- qt(0.90, df=length(carsB$TotalClicks) - 1) * sd(carsB$TotalClicks, na.rm = T) / sqrt(length(carsB$TotalClicks))
leftTCB <- mean(carsB$TotalClicks, na.rm = T) - errorTCB
rightTCB <- mean(carsB$TotalClicks, na.rm = T) + errorTCB
leftTCB; rightTCB
## [1] 5.467138
## [1] 5.552446
# Calculate Click change based on price in groups
clicksA <- (leftTCA+rightTCA)/2
clicksB <- (leftTCB+rightTCB)/2
clicksTot <- clicksA + clicksB
conversion <- clicksA/clicksTot - clicksB/clicksTot
rate <- round(conversion * 100, 2)
# Receieved 3.37% less clicks in Group A on Average
rate
## [1] -3.37
# Examine Three Charts Together
## Standard Deviation of Clicks through Days Live
## Median Clicks through Days Live
## Count of Clicks through Days Live
### Again, high amount of observations are on the Hatback and throughout the month decreases
### The abundance of hatchbacks in the early days will skew our A/B Test results for any inference
DaysLiveGroup <- group_by(days_live, carrosserie, .data = carsAB)
DaysClicks <- summarise(DaysLiveGroup,
                         sdClicks = sd(TotalClicks, na.rm = T),
                         medianClicks = median(TotalClicks, na.rm = T),
                         count = n())
p1 <- ggplot(DaysClicks) + 
  geom_smooth(aes(x=days_live, y=sdClicks, color=carrosserie), se = F) + 
  xlim(0,30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Deviation of Clicks")
p2 <- ggplot(DaysClicks) + 
  geom_smooth(aes(x=days_live, y=medianClicks, color=carrosserie), se = F) + 
  xlim(0,30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Median of Clicks")
p3 <- ggplot(DaysClicks) + 
  geom_smooth(aes(x=days_live, y=count, color=carrosserie), se = F) + 
  xlim(0,30) +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Count of Clicks")
grid.arrange(p1, p2, p3, ncol = 1)

# Created an interactive graph so we can play with the data
## lets examine the count data for the entire lifecycle
### Hatchbacks highly popular within test groups
CarsCount <- ggplot(DaysClicks) + 
  geom_smooth(aes(x=days_live, y=count, color=carrosserie), se = F) + 
  xlim(0,150) +
  labs(title = "Clicks per Vehicle by Days Live") +
  labs(color = "Vehicles") +
  labs(x = "Days Live", y = "Number of Clicks") 
ggplotly(CarsCount)
# Examine some summary statistics of the Hatcback
## it is below the mean/median price of the group
## it has less power than the average car in the group
## it has less kilometers ran on average even though the age is about the same as group
carsAB %>% filter(carrosserie == "Hatchback (3/5-deurs)") %>% 
  select(photo_cnt, vermogen, price, age, kmstand, group) %>% 
  summary()
##    photo_cnt        vermogen          price               age       
##  Min.   : 0.00   Min.   :  0.00   Min.   :       0   Min.   : 3.00  
##  1st Qu.:10.00   1st Qu.: 50.00   1st Qu.:   17500   1st Qu.: 7.00  
##  Median :14.00   Median : 60.00   Median :   49500   Median :11.00  
##  Mean   :14.98   Mean   : 66.49   Mean   :   73295   Mean   :11.65  
##  3rd Qu.:21.00   3rd Qu.: 77.00   3rd Qu.:   99000   3rd Qu.:16.00  
##  Max.   :24.00   Max.   :478.00   Max.   :99999990   Max.   :28.00  
##                                   NA's   :3                         
##     kmstand           group          
##  Min.   :      0   Length:76156      
##  1st Qu.:  59494   Class :character  
##  Median : 113978   Mode  :character  
##  Mean   : 121327                     
##  3rd Qu.: 172052                     
##  Max.   :1455145                     
##  NA's   :512
carsAB %>% filter(carrosserie != "Hatchback (3/5-deurs)") %>% 
  select(photo_cnt, vermogen, price, age, kmstand, group) %>% 
  summary()
##    photo_cnt        vermogen         price               age       
##  Min.   : 0.00   Min.   :  0.0   Min.   :       0   Min.   : 3.00  
##  1st Qu.:11.00   1st Qu.: 77.0   1st Qu.:   25000   1st Qu.: 8.00  
##  Median :17.00   Median : 92.0   Median :   69500   Median :12.00  
##  Mean   :16.54   Mean   :103.9   Mean   :  116992   Mean   :12.08  
##  3rd Qu.:24.00   3rd Qu.:118.0   3rd Qu.:  152000   3rd Qu.:16.00  
##  Max.   :24.00   Max.   :541.0   Max.   :99999990   Max.   :28.00  
##                                  NA's   :2                         
##     kmstand           group          
##  Min.   :      0   Length:81329      
##  1st Qu.:  93037   Class :character  
##  Median : 154642   Mode  :character  
##  Mean   : 158874                     
##  3rd Qu.: 215856                     
##  Max.   :1120000                     
##  NA's   :479
# New table with more evenly distributed car data
carsABRecent <- carsAB %>% filter(carrosserie != "Hatchback (3/5-deurs)")
t.test(TotalClicks ~ group, data = carsABRecent)
## 
##  Welch Two Sample t-test
## 
## data:  TotalClicks by group
## t = -6.7353, df = 78273, p-value = 1.647e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.5129820 -0.2817207
## sample estimates:
## mean in group A mean in group B 
##        4.872662        5.270013
# Calculate Click change based on price in groups
clicksAB <- 4.872662
clicksBA <- 5.270013
clicksTots <- clicksAB + clicksBA
conversion_smooth <- clicksAB/clicksTots - clicksBA/clicksTots
rate_smooth <- round(conversion_smooth * 100, 2)
# Receieved 3.92% less clicks in Group A on Average
# Customers in these samples are more likely to click on cheap cars
# Even when we remove the cheap Hatchback as a highly popular option
rate_smooth
## [1] -3.92
# Having concluded our A/B Analysis, lets go back to our main data
# After examining the variables, we found that many had a "?" or "None" field as factors
# so clean some of the missing/dirty data from these features
cars$kleur         <- as.factor(ifelse(cars$kleur == ".", 
                                       "Other", cars$kleur))
cars$carrosserie   <- as.factor(ifelse(cars$carrosserie == ".",
                                       "Other", cars$carrosserie))
cars$aantaldeuren  <- as.factor(ifelse(cars$aantaldeuren == ".", 
                                       "Other", cars$aantaldeuren))
cars$energielabel  <- as.factor(ifelse(cars$energielabel == ".", 
                                       "Other", cars$energielabel))
cars$aantalstoelen <- as.factor(ifelse(is.na(cars$aantalstoelen), 
                                       "Other", cars$aantalstoelen))
cars$photo_cnt     <- as.factor(cars$photo_cnt)
cars$emissie       <- as.numeric(cars$emissie)
# Drop out any price that is unrealistic
# €0 for a car, or 100 million for a Volvo, etc.
cars$price <- ifelse(cars$price < quantile(cars$price, 0.05, na.rm = T), NA,
                     ifelse(cars$price > quantile(cars$price, 0.98, na.rm = T), NA, cars$price)) 
# "Model" alone has no predictive power but combined with the brand it may
# Combine the Brand and Model of Cars
# Now we can drop "Model" as its mostly noise for our algorithm
cars$brand <- str_replace_all(cars$brand, pattern = "[[:punct:]]", "")
cars$brand <- str_replace_all(cars$brand, pattern = "\\s+", " ")
cars$label <- as.factor(paste(cars$brand, cars$model, sep = " "))
cars$label <- str_replace_all(cars$label, pattern = "[[:punct:]]", "")
cars$label <- str_replace_all(cars$label, pattern = "\\s+", " ")
# Let examine our data and see whats popular for our ads
# Format the Cars Labels
cars$label <- as.factor(tolower(cars$label))
AllLabels <- str_split(cars$label, " ")
# how many words per label
WordsPerLabel <- sapply(AllLabels, length)
# table of frequencies
table(WordsPerLabel)
## WordsPerLabel
##      2      3      4      5      6      7      8      9     10     11 
## 163745  17985    848    370     29     21     11     11      6      8 
##     12     13     14     15     16     18     19 
##      4     10      2      5      4      2      1
# to get it as a percent
100 * round(table(WordsPerLabel)/length(WordsPerLabel), 4)
## WordsPerLabel
##     2     3     4     5     6     7     8     9    10    11    12    13 
## 89.45  9.82  0.46  0.20  0.02  0.01  0.01  0.01  0.00  0.00  0.00  0.01 
##    14    15    16    18    19 
##  0.00  0.00  0.00  0.00  0.00
# vector of words in labels
TitleWords <- unlist(AllLabels)
# get unique words
UniqueWords <- unique(TitleWords)
NumUniqueWords <- length(unique(TitleWords))
# vector to store counts
CountWords <- rep(0, NumUniqueWords)
# count number of occurrences
for (i in 1:NumUniqueWords) {
  CountWords[i] = sum(TitleWords == UniqueWords[i])
}
# index values in decreasing order
Top30Order <- order(CountWords, decreasing = TRUE)[1:30]
# top 30 frequencies
Top30Freqs <- sort(CountWords, decreasing = TRUE)[1:30]
# select top 30 words
Top30Words <- UniqueWords[Top30Order]
# barplot
## Volkswagen seems to be far ahead of the others
barplot(Top30Freqs, border = NA, names.arg = Top30Words,
        las = 2, ylim = c(0,25000))

# Lets see what vehicle type relates to the highest brand
## similar to our AB test results of Hatchback
## the three most popular cars all have Hatchback types
cars %>% 
  group_by(brand, carrosserie) %>% 
  mutate(count = n()) %>%
  select(brand, carrosserie, count) %>%
  arrange(desc(count)) %>% 
  unique() %>% 
  head()
## # A tibble: 6 x 3
## # Groups:   brand, carrosserie [6]
##   brand      carrosserie           count
##   <chr>      <fct>                 <int>
## 1 VOLKSWAGEN Hatchback (3/5-deurs) 11075
## 2 PEUGEOT    Hatchback (3/5-deurs)  8485
## 3 OPEL       Hatchback (3/5-deurs)  6815
## 4 FORD       Hatchback (3/5-deurs)  6196
## 5 RENAULT    Hatchback (3/5-deurs)  5908
## 6 FIAT       Hatchback (3/5-deurs)  5194
# Other features  creating
## We can bin certain variables if they are worthwhile
## Vermogen can be split into High,Low Power
## Emissiens can be split into High, Low Emission Cars
cars %>% select(age, price, kmstand, vermogen, days_live, TotalClicks, emissie) %>%
  cor(use = "complete.obs") %>% corrplot::corrplot()

# Features to Drop
# model has too many factors with no value, date was used in Age
# l2 is unknown but mostly noise
table(cars$l2)
## 
##  100  101  103  105  108  110  111  112  113  114  115  117  118  119  122 
##  245 1948  519  446   81    7 2398 3274   26  513  992  180  118  882    2 
##  123  124  125  127  128  129  130  131  132  133  134  135  138  139  140 
##  100  267  100    2   15  781 2705    2   47  545  607  986 4719   45 3995 
##  143  144  146  147  148  150  151  152  153  154  155  157  158  159 2148 
##    6  186 3927  104  280 1908  661  266  162 1242 1870 8072 2038   91   11 
## 2149 2150 2151 2152 2153 2155 2156 2659 2660 2829 2830 2831   92   93   95 
##    3    2   26    2    2    1    1    5   93    4    9    1  634 2872 5273 
##   96   97   98   99 
## 3687    4   25  470
# Select final features, drop ones we won't use or could cause data leakage
features <- cars %>%
  select(- `group`, - model, - ad_start_dt,
         - src_ad_id, - bouwjaar, - l2, - webclicks, - telclicks, - TotalClicks,
         - n_asq, - bids)
str(features)
## 'data.frame':    183062 obs. of  19 variables:
##  $ kleur           : Factor w/ 9 levels "Beige","Blauw",..: 8 8 8 9 8 9 6 4 4 8 ...
##  $ carrosserie     : Factor w/ 8 levels "Cabriolet","Coup",..: 3 3 7 6 4 7 1 8 3 NA ...
##  $ kmstand         : num  44958 25072 301409 194481 238101 ...
##  $ days_live       : num  31 31 31 31 9 8 2 31 31 100 ...
##  $ photo_cnt       : Factor w/ 25 levels "0","1","2","3",..: 17 23 13 22 9 14 25 15 19 12 ...
##  $ aantaldeuren    : Factor w/ 9 levels "0","1","2","3",..: 4 6 6 5 5 6 3 6 6 6 ...
##  $ emissie         : num  105 122 132 314 178 140 145 128 0 156 ...
##  $ energielabel    : Factor w/ 7 levels "2","3","4","5",..: 1 3 2 7 4 3 3 3 3 2 ...
##  $ brand           : chr  "VOLKSWAGEN" "VOLKSWAGEN" "OPEL" "AUDI" ...
##  $ vermogen        : num  44 92 59 331 79 74 97 81 80 103 ...
##  $ aantalstoelen   : Factor w/ 33 levels "1","10","116 km NAP",..: 14 16 16 16 16 14 14 16 16 16 ...
##  $ price           : num  79500 189500 27500 199000 12500 ...
##  $ age             : num  6 4 15 12 17 13 4 3 18 12 ...
##  $ annual_emissions: num  17.5 30.5 8.8 26.2 10.5 ...
##  $ annual_kms      : num  7493 6268 20094 16207 14006 ...
##  $ ageGroup        : Factor w/ 6 levels "(<=3)","(11-15)",..: 5 5 2 2 3 2 5 1 3 2 ...
##  $ clicked         : Factor w/ 2 levels "0","1": 2 2 2 2 1 2 2 1 1 2 ...
##  $ TenDaysClick    : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ label           : Factor w/ 2332 levels "ackermannfruehauf na",..: 2283 2216 1590 134 1878 1979 1855 1990 1677 750 ...
## Examine the Machine Learning Algorithms we will use
# H2O library was used for performance gains
# Algorithms that can effectively handle NA's were used (RF Imputation was used with no difference)
# Algorithms that can effectively scale were used (YeoJohnson was used with no difference)

carsh2o <- as.h2o(features)
# Split into Training/Validation/Testing sets
splits <- h2o.splitFrame(data = carsh2o, ratios = c(0.7, 0.15), seed = 1)
train <- splits[[1]]
validate <- splits[[2]]
test <- splits[[3]]
# Define Label and Predictors
response <- "TenDaysClick"
predictors <- setdiff(names(train), response)
# Define as Factor since we want to know if its a Click (1) or Not (0)
train[,response] <- as.factor(train[,response])
validate[,response] <- as.factor(validate[,response])
test[,response] <- as.factor(test[,response])
# GBM Algorithm with minor human tuning
# One Hot Encoded Variables as it usually improves RMSE
gbmFit <- h2o.gbm(x = predictors,
                   y = response,
                   training_frame = train,
                   model_id = "gbmFit",
                   validation_frame = validate,
                   ntrees = 500,
                   score_tree_interval = 5,
                   stopping_rounds = 3,
                   stopping_metric = "RMSE",
                   stopping_tolerance = 0.0005,
                   categorical_encoding = "OneHotExplicit",
                   seed = 1)
gbmPerf <- h2o.performance(model = gbmFit,
                            newdata = test)
gbmPerftr <- h2o.performance(model = gbmFit,
                            newdata = train)
print(gbmPerf)
print(gbmPerftr)
# Distributed RandomForest
rfFit <- h2o.randomForest(x = predictors,
                           y = response,
                           training_frame = train,
                           model_id = "rfFit",
                           seed = 1,
                           nfolds = 5)
rfPerf <- h2o.performance(model = rfFit,
                           newdata = test)
rfPerftr <- h2o.performance(model = rfFit,
                           newdata = train)
print(rfPerf)
print(rfPerftr)
# Generalized Linear Model with gaussian Family
glmFit <- h2o.glm( x = predictors, 
                    y = response, 
                    training_frame = train,
                    model_id = "glmFit",
                    validation_frame = validate,
                    family = "gaussian",
                    lambda_search = TRUE)
glmPerf <- h2o.performance(model = glmFit,
                            newdata = test)
glmPerftr <- h2o.performance(model = glmFit,
                            newdata = train)
print(glmPerf)
print(glmPerftr)
# Lets examine the results based on RMSE
# RF/GBM performed quite close 
# GBM had the best RMSE but it was also the slowest
# RMSE
rfper <-h2o.rmse(rf_perf)  # 7.471413
gbmper <- h2o.rmse(gbm_perf) # 7.70153
glmper <- h2o.rmse(glm_perf) # 10.11961

rfpertr <-h2o.rmse(rf_perf_train)  # 3.467813
gbmpertr <- h2o.rmse(gbm_perf_train) # 7.604088
glmpertr <- h2o.rmse(glm_perf_train) # 10.68962



rf <- cbind(rfper, rfpertr)
gbm <- cbind(gbmper, gbmpertr)
glm <- cbind(glmper, glmpertr)

final <- rbind(rf,gbm,glm)
colnames(final) <- c("Test","Train")
rownames(final) <- c("RF","GBM","GLM")
print(final)

Results are below

Test Train
RF 7.471413 3.467813
GBM 7.701530 7.604088
GLM 10.119612 10.689616

RF is always generate good result. But we have to careful when we select it. Because in Train, it looks like overfitting. Therefore I will selecet GBM and do hyperparameter tuning to it.

# Variable Importance
# We see that for GBM the new feature (Age) we created was the most important
# for RF age was the 5th most important

print(gbm_fit@model$variable_importances)
print(rf_fit@model$variable_importances)

Variable Importances for GBM:

variable relative_importance scaled_importance percentage
1 days_live 16982456.000000 1.000000 0.501079
2 price 5190817.000000 0.305658 0.153159
3 clicked.0 4012160.500000 0.236253 0.118382
4 age 1407525.750000 0.082881 0.041530
5 label.volkswagen polo 725675.562500 0.042731 0.021412
variable relative_importance scaled_importance percentage
2446 label.westfield cabriolet 0.000000 0.000000 0.000000
2447 label.westfield se k6 0.000000 0.000000 0.000000
2448 label.westfield westfield 0.000000 0.000000 0.000000
2449 label.xxtrail na 0.000000 0.000000 0.000000
2450 label.zundapp fabia 0.000000 0.000000 0.000000
2451 label.missing(NA) 0.000000 0.000000 0.000000

Variable Importances for RF:

variable relative_importance scaled_importance percentage
1 days_live 102245984.000000 1.000000 0.239505
2 photo_cnt 48573184.000000 0.475062 0.113780
3 price 43279472.000000 0.423288 0.101379
4 label 29833160.000000 0.291778 0.069882
5 age 23406868.000000 0.228927 0.054829
6 kleur 19365180.000000 0.189398 0.045362
7 kmstand 19202722.000000 0.187809 0.044981
8 clicked 19151188.000000 0.187305 0.044860
9 energielabel 17910164.000000 0.175167 0.041953
10 annual_emissions 17643446.000000 0.172559 0.041329
11 annual_kms 16709458.000000 0.163424 0.039141
12 vermogen 14400799.000000 0.140845 0.033733
13 aantaldeuren 13734035.000000 0.134323 0.032171
14 emissie 13316308.000000 0.130238 0.031193
15 carrosserie 13147768.000000 0.128590 0.030798
16 ageGroup 8611877.000000 0.084227 0.020173
17 aantalstoelen 6373984.500000 0.062340 0.014931
# Since GBM was the best performer lets tune it
# Hyper Parameter Tuning 
# First Pass
hyper_params = list( max_depth = seq(1,29,2) ) # Since dataqset is small
grid <- h2o.grid(
  hyper_params = hyper_params,
  ## full Cartesian hyper-parameter search
  search_criteria = list(strategy = "Cartesian"),
  algorithm="gbm",
  grid_id="depth_grid",
  x = predictors, 
  y = response, 
  training_frame = train, 
  validation_frame = validate,
  ## here, use "more than enough" trees - we have early stopping
  ntrees = 10000,                                                            
  ## since we have learning_rate_annealing, we can afford to start with a bigger learning rate
  learn_rate = 0.05,                                                         
  ## learning rate annealing: learning_rate shrinks by 1% after every tree 
  learn_rate_annealing = 0.99,                                               
  sample_rate = 0.8,                                                       
  col_sample_rate = 0.8, 
  seed = 1234,                                                             
  ## early stopping once the validation AUC doesn't improve by at least 0.01% for 5 consecutive scoring events
  stopping_rounds = 5,
  stopping_tolerance = 1e-4,
  stopping_metric = "RMSE", 
  ## score every 10 trees to make early stopping reproducible (it depends on the scoring interval)
  score_tree_interval = 10                                                
)
## sort the grid models by decreasing AUC
sortedGrid <- h2o.getGrid("depth_grid", sort_by="rmse", decreasing = TRUE) 
## find the range of max_depth for the top 5 models
topDepths = sortedGrid@summary_table$max_depth[1:5]                       
minDepth = min(as.numeric(topDepths))
maxDepth = max(as.numeric(topDepths))
# Now that we know a good range for max_depth, 
# we can tune all other parameters in more detail 
# Since we don’t know what combinations of hyper-parameters will result in the best model, 
# we’ll use random hyper-parameter search 
hyper_params = list( 
  ## restrict the search to the range of max_depth established above
  max_depth = seq(minDepth,maxDepth,1),                                      
  sample_rate = seq(0.2,1,0.01),                                             
  col_sample_rate = seq(0.2,1,0.01),                                         
  col_sample_rate_per_tree = seq(0.2,1,0.01),                                
  col_sample_rate_change_per_level = seq(0.9,1.1,0.01),                      
  min_rows = 2^seq(0,log2(nrow(train))-1,1),                                 
  nbins = 2^seq(4,10,1),                                                     
  nbins_cats = 2^seq(4,12,1),                                                
  min_split_improvement = c(0,1e-8,1e-6,1e-4),                               
  histogram_type = c("UniformAdaptive","QuantilesGlobal","RoundRobin")       
)
search_criteria = list(
  ## Random grid search
  strategy = "RandomDiscrete",      
  ## limit the runtime to 60 minutes
  max_runtime_secs = 3600,         
  ## build no more than 100 models
  max_models = 100,                  
  seed = 1234,                        
  ## early stopping once the leaderboard of the top 5 models is converged to 0.1% relative difference
  stopping_rounds = 5,                
  stopping_metric = "RMSE",
  stopping_tolerance = 1e-3
)
grid <- h2o.grid(
  hyper_params = hyper_params,
  search_criteria = search_criteria,
  algorithm = "gbm",
  grid_id = "final_grid", 
  x = predictors, 
  y = response, 
  training_frame = train, 
  validation_frame = validate,
  ntrees = 10000,                                                            
  learn_rate = 0.05,                                                         
  learn_rate_annealing = 0.99,                                               
  max_runtime_secs = 3600,                                                 
  stopping_rounds = 5, stopping_tolerance = 1e-4, stopping_metric = "RMSE", 
  score_tree_interval = 10,
  nfolds = 5,
  seed = 1234                                                             
)
## Sort the grid models by RMSE
sortedGrid <- h2o.getGrid("final_grid", sort_by = "rmse", decreasing = TRUE)    
print(sortedGrid)
# Choose Best Model
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])
print(h2o.rmse(h2o.performance(gbm, newdata = test))) 

Final RMSE is 6.924783

# Keeping the same “best” model,
# we can make test set predictions as follows:
preds <- h2o.predict(gbm, test)
head(preds, 10)
summary(preds)
# Final GBM Metrics
print(gbm@model$validation_metrics@metrics$r2)

R2 is 0.500924

# Save Model and Predictions
h2o.saveModel(gbm, "/Users/mustafaozturk/Desktop/eBay Case/best_model.csv", force=TRUE)
h2o.exportFile(preds, "/Users/mustafaozturk/Desktop/eBay Case/best_preds.csv", force=TRUE)
|```